In ecohealthalliance/gains-summary: Simple summaries of GAINS data.

knitr::opts_knit$set(root.dir = "~/Dropbox (EHA)/repositories/gains-summary")
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)

# Load required packages
library(plyr)
library(dplyr)
library(magrittr)

# Load China data
china_full <- read.csv("inst/rawdata/CHINA.csv", as.is = TRUE)
keeps <- c(1:4, 7, 8, 14, 16:20, 23, 24, 28, 40, 49:52, 55,
    56, 60, 137, 138, 139)
china <- china_full[, keeps]
names(china)

# Load Kevin's data
kevin <- read.csv("inst/rawdata/KevinP1rank.csv", as.is = TRUE)
names(kevin)

# Transform virus name variables so they're matchable.
library(stringr)
kevin$virusname <- str_replace_all(kevin$virusname, " ", "")

select_first <- function(x) {
    str_split(x, "\\|")[[1]][1]
}

china$VirusName <- sapply(china$VirusName, select_first, USE.NAMES = FALSE)

china$VirusName[china$VirusName == "theSARSRelatedCoronavirusHKU3"] <- "SARSRelatedCoronavirusHKU3"
china$VirusName[china$VirusName == ""] <- NA

# Merge the data frames
china <- left_join(china, kevin, by = c("VirusName" = "virusname"))
china$positive <- 0
china[!is.na(china$VirusName), "positive"] <- 1

Summary data index variable

By AnimalID: the sum of all viruses for which the var: virusname exists (else=0) (freq tbl)

viruses_per_animal <- china %>%
  group_by(AnimalID..GAINS.) %>%
  summarize(viruses_matched = length(unique(na.omit(VirusName)))) %>%
  group_by(viruses_matched) %>%
  summarize(individuals = length(viruses_matched))

viruses_per_animal$percent <- round(viruses_per_animal$individuals / sum(viruses_per_animal$individuals) * 100, 2)

knitr::kable(viruses_per_animal, col.names = c("Viruses per animal", "Number of animals", "Percentage"), caption = "Individual animals by sum of viruses")

Individual level data based on AnimalID

total_tested <- length(unique(china$AnimalID..GAINS.))
total_positive <- length(unique(china$AnimalID..GAINS.[china$positive > 0]))
total_risk1 <- length(na.omit(unique(china$AnimalID..GAINS.[china$Risklevel == 1])))

Total number of animals tested: r total_tested
Total number of animals tested virus positive r total_positive
Total number of virus-positive animals at Risklevel = 1: r total_risk1

Frequencies for total N animals tested

animals <- china %>%
  group_by(SpeciesScientificName) %>%
  summarize(SpeciesCommonNameEnglish = unique(SpeciesCommonNameEnglish),
            number = length(unique(AnimalID..GAINS.))) %>%
  arrange(desc(number))

names(animals) <- c("scientific", "common", "number")
animals$common <- sapply(animals$common, function(x) strsplit(x, " within")[[1]][1])

knitr::kable(animals, col.names = c("Scientific name", "Common name", "Number tested"), caption = "Number of individuals tested per animal species")

PIG <- china %>%
  group_by(PrimaryInterfaceGroup) %>%
  summarize(number = length(unique(AnimalID..GAINS.))) %>%
  arrange(desc(number))

PIG$percent <- round(PIG$number / sum(PIG$number) * 100, 2)

knitr::kable(PIG, col.names = c("Primary interface group", "Number of individuals", "Percentage"), caption = "Number of individuals tested per primary interface group")

SIG <- china %>%
  group_by(SecondaryInterfaceGroup) %>%
  summarize(number = length(unique(AnimalID..GAINS.))) %>%
  arrange(desc(number))

SIG$percent <- round(SIG$number / sum(SIG$number) * 100, 2)

knitr::kable(SIG, col.names = c("Secondary interface group", "Number of individuals", "Percentage"), caption = "Number of individuals tested per secondary interface group")

Cross tab among virus positive animals

china$risk1 <- 0
china$risk1[china$Risklevel == 1] <- 1

risk1_crosstab <- china %>%
  group_by(AnimalID..GAINS.) %>%
  filter(positive == 1) %>%
  summarize(risk1 = ifelse(mean(risk1) > 0, 1, 0),
            viruses_matched = length(unique(na.omit(VirusName)))) %>%
  group_by(viruses_matched) %>%
  summarize(individuals = length(viruses_matched),
            risk1 = sum(risk1))

risk1_crosstab$percent <- round(risk1_crosstab$risk1/risk1_crosstab$individuals * 100, 2)

knitr::kable(risk1_crosstab, col.names = c("Viruses", "Individuals, total", "Individuals, 'high risk'", "Percentage"), caption = "Cross tabs of number of viruses in individuals by presence of risk level 1 virus")

riskfish <- risk1_crosstab[, 2:3]
riskfish$other = riskfish$individuals - riskfish$risk1
riskfish$individuals <- NULL
riskfishtest <- fisher.test(riskfish)

Just out of interest, I ran Fisher's exact test on the data. The p-value was $r (riskfishtest$p.value)$.

Frequencies for Risklevel = 1

risk1_viruses <- china %>%
  filter(Risklevel == 1) %>%
  group_by(VirusName) %>%
  summarize(animals = length(unique(na.omit(AnimalID..GAINS.)))) %>%
  select(VirusName, animals) %>%
  arrange(desc(animals))

knitr::kable(risk1_viruses, col.names = c("Virus name", "Number of animals"), caption = "'High risk' viruses from China GAINS data")

risk1_individual <- china %>%
  group_by(AnimalID..GAINS.) %>%
  summarize(scientific = unique(SpeciesScientificName),
            common = unique(SpeciesCommonNameEnglish),
            risk1 = ifelse(mean(risk1) > 0, 1, 0),
            PIG = unique(PrimaryInterfaceGroup),
            SIG = unique(SecondaryInterfaceGroup))

risk1_species <- risk1_individual %>%
  group_by(scientific) %>%
  summarize(common = unique(common),
            animals = length(unique(AnimalID..GAINS.)),
            risk1 = length(risk1[risk1 == 1]),
            percent = round(risk1 / animals * 100, 2)) %>%
  filter(risk1 > 0) %>%
  arrange(desc(percent))

risk1_species$common <- sapply(risk1_species$common, function(x) strsplit(x, " within")[[1]][1])

knitr::kable(risk1_species, col.names = c("Scientific name", "Common name", "Tested", "'High risk'", "%"), caption = "Number of animals with 'high risk' viruses per species")

risk1_PIG <- risk1_individual %>%
  group_by(PIG) %>%
  summarize(animals = length(unique(AnimalID..GAINS.)),
            risk1 = length(risk1[risk1 == 1]),
            percent = round(risk1 / animals * 100, 2)) %>%
  arrange(desc(percent))

knitr::kable(risk1_PIG, col.names = c("Primary interface group", "Tested", "'High risk'", "%"), caption = "Number of animals with 'high risk' viruses per primary interface group")

risk1_SIG <- risk1_individual %>%
filter(!(SIG == "")) %>%
  group_by(SIG) %>%
  summarize(animals = length(unique(AnimalID..GAINS.)),
            risk1 = length(risk1[risk1 == 1]),
            percent = round(risk1 / animals * 100, 2)) %>%
  arrange(desc(percent))

knitr::kable(risk1_SIG, col.names = c("Secondary interface group", "Tested", "'High risk'", "%"), caption = "Number of animals with 'high risk' viruses per secondary interface group")

Toph's stuff.

viruses <- table(china$VirusName[!(china$VirusName == "")])
viruses <- data.frame(viruses)
viruses <- viruses[order(viruses$Freq, decreasing = TRUE), ]
names(viruses) <- c("VirusName", "freq")
viruses$VirusName <- as.character(viruses$VirusName)

knitr::kable(head(viruses, 15), col.names = c("Virus name", "Frequency"), caption = ("Top 15 viruses from GAINS in China"))

# For my own edification, another way of doing this.
# viruses <- group_by(china, VirusName)
# viruses <- summarize(viruses, freq = length(VirusName))
# viruses <- viruses[order(viruses$freq, decreasing = TRUE), ]
# knitr::kable(viruses)

Total number of viruses / total tests by risk level

prev_by_risk <- china %>% # dplyr and magrittr
  group_by(Risklevel) %>%
  summarize(total = sum(positive),
            prevalence = (sum(positive) / nrow(china)) * 100)

prev_by_risk$prevalence <- round(prev_by_risk$prevalence, 2)

knitr::kable(prev_by_risk, col.names = c("Risk level", "Total", "Prevalence"), caption = "Sample prevalence of viruses by risk level")

Sum of viruses by animal ID

A table of the number of total matches for each animal.

prev_by_animal <- china %>%
  # filter(positive > 0) %>%
  group_by(AnimalID..GAINS.) %>%
  summarize(total = sum(positive)) %>%
  arrange(desc(total))

prev_by_animal <- data.frame(table(prev_by_animal$total))
prev_by_animal$Var1 <- as.numeric(as.character(prev_by_animal$Var1))
prev_by_animal$Perc <- round(prev_by_animal$Freq / sum(prev_by_animal$Freq) * 100, 2)

knitr::kable(prev_by_animal, col.names = c("Number of viruses", "Frequency in animals", "Percentage of animals"), caption = "Frequency of animals with 1, 2, and 3 matched viruses")

Species-level summaries

N.B. Kevin says that it's best to use the AVE_Manual variable, as Risklevel is really based on arbitrary cutoffs and was created for USAID; better to preserve the continuous nature of the ranking if we don't need to do stuff categorical.

There are r nrow(unique(china[, c("SpeciesScientificName", "SpeciesCommonNameEnglish")])) species in the database.

The following table shows the highest risk level of virus found in each species.

by_species <- china %>%
  group_by(SpeciesScientificName) %>%
  summarize(highest_risklevel = min(na.omit(Risklevel))) %>%
  arrange(highest_risklevel)

by_species$highest_risklevel[is.infinite(by_species$highest_risklevel)] <- NA

knitr::kable(by_species, col.namse = c("Species", "Any match", "Highest risk level of positive match"), caption = "Summaries of virus matches by species")

Note: A risk level of NA indicates no matches in that species.

I'm not actually sure how useful the next table will be, but I wanted to get a measure of virodiversity for tested species. The table shows the mean number of virus species found in individual animals (across all specimens, which I'm sure introduces bias—I'm sure Simon could tell us about that) for each species.

mean_viruses_by_species <- china %>%
  group_by(AnimalID..GAINS.) %>%
  summarize(num_species = length(unique(na.omit(VirusName))),
            SpeciesScientificName = SpeciesScientificName[1]) %>%
  group_by(SpeciesScientificName) %>%
  summarize(num_tested = length(AnimalID..GAINS.),
            mean_species = sum(num_species)/num_tested) %>%
  arrange(desc(mean_species)) %>%
  filter(mean_species > 0.0)

knitr::kable(mean_viruses_by_species, col.names = c("Species", "Number of animals tested", "Mean virus species"), caption = "Mean number of virus species found per individual for different species")

And the same for taxonomic group:

mean_viruses_by_species <- china %>%
  group_by(AnimalID..GAINS.) %>%
  summarize(num_species = length(unique(na.omit(VirusName))),
            SpeciesScientificName = SpeciesScientificName[1]) %>%
  group_by(SpeciesScientificName) %>%
  summarize(num_tested = length(AnimalID..GAINS.),
            mean_species = sum(num_species)/num_tested) %>%
  arrange(desc(mean_species)) %>%
  filter(mean_species > 0.0)

knitr::kable(mean_viruses_by_species, col.names = c("Species", "Number of animals tested", "Mean virus species"), caption = "Mean number of virus species found per individual for different species")